library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
crime_df = readRDS(file = "datasets/nyc_felony_crimes.rds")
url = "https://www.census.gov/quickfacts/fact/table/newyorkcitynewyork,bronxcountybronxboroughnewyork,kingscountybrooklynboroughnewyork,newyorkcountymanhattanboroughnewyork,queenscountyqueensboroughnewyork,richmondcountystatenislandboroughnewyork/PST045217"
boro_pop = read_html(url)

boro_pop %>%
  html_nodes(css = "table")
## {xml_nodeset (4)}
## [1] <table class="type" data-geospan="6">\n<colgroup>\n<col>\n<col>\n<co ...
## [2] <table class="type" data-geospan="6">\n<caption class="icon-user-3"> ...
## [3] <table class="type" data-geospan="6">\n<caption class="icon-industry ...
## [4] <table class="type" data-geospan="6">\n<caption class="icon-globe-2" ...
pop_df = (boro_pop %>% html_nodes(css = "table")) %>% 
  .[[1]] %>%
  html_table() %>% 
  as_tibble() %>% 
 janitor::clean_names()

names(pop_df)[1:7] = c("estimate_date", "new_york_city", "bronx", "brooklyn", "manhattan", "queens", "staten_island")

pop_df = pop_df %>% 
  gather(key = boro_nm, value = population, estimate_date:staten_island) %>% 
  mutate(population = if_else(population == "Population estimates, July 1, 2017,  (V2017)", "2017", population),
         population = as.numeric(gsub("," , "", population)))
nycc_df = crime_df %>% 
  mutate(boro_nm = if_else(boro_nm == "staten island", "staten_island", boro_nm))

Looking at the individual offenses by borough over the four years:

crime_df %>% 
  distinct(pd_cd) %>% 
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1   221
combined_df = nycc_df %>% group_by(boro_nm, pd_cd, year) %>% 
  summarise(count = n())


full = left_join(combined_df, pop_df, by = "boro_nm") %>% 
mutate(off_rate = ((count/population)*100)) 

full[,'off_rate'] = round(full[,'off_rate'],2)

test <- sqldf("SELECT boro_nm, year, off_rate
              FROM full
              WHERE off_rate IS NOT NULL and year = '2017'
              ORDER BY boro_nm, off_rate DESC")

test_2017 <- test[!duplicated(test$boro_nm), ]


test2 <- sqldf("SELECT boro_nm, year, off_rate
              FROM full
              WHERE off_rate IS NOT NULL and year = '2016'
              ORDER BY boro_nm, off_rate DESC")

test_2016 <- test2[!duplicated(test2$boro_nm), ]


test3 <- sqldf("SELECT boro_nm, year, off_rate
              FROM full
              WHERE off_rate IS NOT NULL and year = '2015'
              ORDER BY boro_nm, off_rate DESC")

test_2015 <- test3[!duplicated(test3$boro_nm), ]


test4 <- sqldf("SELECT boro_nm, year, off_rate
              FROM full
              WHERE off_rate IS NOT NULL and year = '2014'
              ORDER BY boro_nm, off_rate DESC")

test_2014 <- test4[!duplicated(test4$boro_nm), ]

test_new = rbind(test_2017, test_2016, test_2015, test_2014)
 

trend = ggplot(test_new, aes(x = boro_nm, y = off_rate, color = year)) +
geom_point() + 
theme(legend.position = "bottom") +
  labs(
  x = "Borough",
  y = "Offense rate",
  caption = "NYC_Crime"
  ) +
geom_smooth(se = FALSE) +
theme(legend.position = "bottom")

ggplotly(trend)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'